Example Program
Constraint Iterator
Example for using node predicates on a deferred suffix tree.
Given a sequences, we want to find all substrings s that fulfill certain constraints. The relative probabilty to see s should be at least p_min. s should also be not longer than replen_max. The latter constraint is a anti-monotonic pattern predicate and can be used in conjunction with the first constraint to cut of the trunk of a suffix tree. Only the top of the suffix tree contains candidates that might fulfill both predicates, so we can use an Index based on a deferred suffix tree (see Index_Wotd). The following example demonstrates how to iterate over all suffix tree nodes fulfilling the constraints and output them.
1#include <iostream>
2#include <seqan/index.h>
3
4using namespace std;
5using namespace seqan;
6
constraint parameters
7struct TMyConstraints {
8    double p_min;
9    unsigned int replen_max;
10    bool _cachedPred;
11};
12
SeqAn extensions
13namespace seqan 
14{
15    // custom TSpec for our customized wotd-Index
16    struct TMyIndex;
17
18    template <typename TText>
19    struct Cargo<Index<TText, Index_Wotd<TMyIndex> > > {
20        typedef TMyConstraints Type;
21    };
22
23    // node predicate
24    template <typename TText, typename TSpec>
25    bool nodePredicate(Iter<Index<TText, Index_Wotd<TMyIndex> >, TSpec> &it) 
26    {
27        TMyConstraints &cons = cargo(container(it));
28        unsigned int delta = countSequences(container(it)) * repLength(it);
29        unsigned int textLen = length(container(it));
30
31        if (textLen <= delta) return cons._cachedPred = true;
32        return cons._cachedPred = 
33            ((double)countOccurrences(it) / (double)(textLen - delta)) > cons.p_min;
34    }
35
36    // monotonic hull
37    template <typename TText, typename TSpec>
38    bool nodeHullPredicate(Iter<Index<TText, Index_Wotd<TMyIndex> >, TSpec> &it) 
39    {
40        TMyConstraints const &cons = cargo(container(it));
41        unsigned int textLen = length(container(it));
42
43        if (repLength(it) > cons.replen_max) return false;
44
45        unsigned int delta = countSequences(container(it)) * cons.replen_max;
46        if (textLen <= delta) return true;
47        return ((double)countOccurrences(it) / 
48                (double)(textLen - delta)) > cons.p_min;
49    }
50}
51
52int main ()
53{
We begin with a String to store our sequence.
54    String<char> myString = "How many wood would a woodchuck chuck.";
55
Then we create our customized index which is a specialization of the deferred wotd-Index
56    typedef Index< String<char>, Index_Wotd<TMyIndex> > TMyIndex;
57    TMyIndex myIndex(myString);
58
59    cargo(myIndex).replen_max = 10;
60    cargo(myIndex).p_min = 0.05;
61
To find all strings that fulfill our constraints, we simply do a dfs-traversal via goBegin and goNext
62    typedef Iterator< TMyIndex, TopDown<ParentLinks<> > >::Type TConstrainedIterator;
63    TConstrainedIterator myConstrainedIterator(myIndex);
64
65    goBegin(myConstrainedIterator);
66    while (!atEnd(myConstrainedIterator))
67    {
68
countOccurrences returns the number of hits of the representative.
69        cout << countOccurrences(myConstrainedIterator) << "x  ";
70
The representative string can be determined with representative
71        cout << "\t\"" << representative(myConstrainedIterator) << '\"' << endl;
72
73        goNext(myConstrainedIterator);
74    }
75
76    return 0;
77}
Output
weese@tanne:~/seqan$ cd demos
weese@tanne:~/seqan/demos$ make index_node_predicate
weese@tanne:~/seqan/demos$ ./index_node_predicate
38x     ""
6x      " "
3x      " wo"
2x      " wood"
2x      "a"
4x      "c"
2x      "chuck"
2x      "ck"
3x      "d"
2x      "d "
2x      "huck"
2x      "k"
6x      "o"
2x      "od"
2x      "ood"
3x      "u"
2x      "uck"
4x      "w"
3x      "wo"
2x      "wood"
weese@tanne:~/seqan/demos$
SeqAn - Sequence Analysis Library - www.seqan.de